# Importing Tkinter and loading the forecasting dataset chosen via a file dialog.
import pandas as pd
import numpy as np
import tkinter as tk
from tkinter.filedialog import askopenfilename

root = tk.Tk()
root.withdraw()  # hide the empty Tk root window; we only want the file dialog
csvpath = askopenfilename()
root.destroy()
if not csvpath:
    # askopenfilename returns '' when the user cancels; fail fast with a clear
    # message instead of a cryptic read error below.
    raise SystemExit("No file selected - aborting.")
# df = pd.read_csv(csvpath)  # use this instead when the input is a CSV file
df = pd.read_excel(csvpath)  # for xlsx file
print("{:,} records for Forecasting data".format(len(df)))
df.info()
# We have 2 object columns and 1 datetime column; all remaining columns are numeric.
# Inspect each categorical (object-dtype) column: its unique values and their counts.
# NOTE: the notebook export stripped the loop indentation; restored here.
for column in df.columns:
    if df[column].dtypes == 'object':
        print(str(column) + ' : ' + str(df[column].unique()))
        print(df[column].value_counts())
        print('*****************************************************************************************')
        print('\n')

# Bare expressions only display inside a notebook; print them so the script
# also shows the numeric summary and the per-column null counts.
print(df.describe())
print(df.isnull().sum())
# We can see null values in the Event column: it contains NaN alongside the 'covid' value, so we will replace NaN with 'normal' later.
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how often each product_id occurs, ordered most-frequent first.
fig, ax = plt.subplots(figsize=(16, 4))
product_order = df['product_id'].value_counts().index
sns.countplot(x='product_id', data=df, palette='muted', order=product_order, ax=ax)
ax.set_title('Count of product Types')
ax.set_xlabel('product Type')
ax.set_ylabel('count of product type')
plt.xticks(rotation=90)
plt.show()
print(df['product_id'].value_counts())
# We can observe that the highest product count over the year belongs to product 2, with a count of 1093.
# Bar chart of record counts per warehouse, ordered most-frequent first.
fig, ax = plt.subplots(figsize=(16, 4))
warehouse_order = df['warehouse'].value_counts().index
sns.countplot(x='warehouse', data=df, palette='muted', order=warehouse_order, ax=ax)
ax.set_title('Count of warehouse')
ax.set_xlabel('warehouse Type')
ax.set_ylabel('count of warehouse')
plt.xticks(rotation=90)
plt.show()
print(df['warehouse'].value_counts())
# We can observe that the warehouse with the most dispatched/stored products is Warehouse2, with 1814 records.
# Yearly totals per product; wrapped in print() so the table is also visible
# when this runs as a plain script rather than a notebook cell.
print(df.groupby('product_id')[['orders_count','buyers_count','units_sold','Sales value']].sum())
# Above we can observe the sales value over the complete year for each product.
import matplotlib.pyplot as plt
import plotly.express as px

# Four yearly per-product bar charts: total sales value, units sold, buyer
# count, and mean sale price. One spec table drives all four, avoiding four
# copies of the same groupby/plot boilerplate.
yearly_specs = [
    ('Sales value', 'sum', 'Sales value over the year for different products'),
    ('units_sold', 'sum', 'unit sold over the year for different products'),
    ('buyers_count', 'sum', 'buyer count over the year for different products'),
    ('Price_at_sale', 'mean', 'Mean Price_at_sale over the year for different products'),
]
for col, agg, title in yearly_specs:
    series = df.groupby('product_id')[col].agg(agg).sort_values(ascending=False)
    fig = px.bar(series, y=col, title=title)
    fig.show()
# Missing Event means an ordinary day; fill NaN with the explicit label 'normal'.
# fillna is the idiomatic (and clearer) form of replace(np.nan, ...).
df['Event'] = df['Event'].fillna('normal')
print(df['Event'].value_counts())

# Slice out the rows recorded during the covid event for a separate look.
df_covid = df.loc[df['Event'] == 'covid']
print(df_covid)
# Covid-period totals per product (printed so the table shows outside a notebook).
print(df_covid.groupby('product_id')[['orders_count','buyers_count','units_sold','Sales value']].sum())
# Above we can observe the sales value during the covid period for each product.
# In our dataset the covid period falls in April (the 4th month).
# The same four per-product charts, restricted to the covid period. A spec
# table replaces four copies of the groupby/plot boilerplate.
covid_specs = [
    ('Sales value', 'sum', 'Sales value during covid for different products'),
    ('units_sold', 'sum', 'unit sold during covid for different products'),
    ('buyers_count', 'sum', 'buyer count during covid for different products'),
    ('Price_at_sale', 'mean', 'Mean Price_at_sale during covid for different products'),
]
for col, agg, title in covid_specs:
    series = df_covid.groupby('product_id')[col].agg(agg).sort_values(ascending=False)
    fig = px.bar(series, y=col, title=title)
    fig.show()
# Derive a calendar-month column from the sale date for monthly aggregation.
# pd.to_datetime(...).dt.month is the modern accessor form of
# pd.DatetimeIndex(...).month.
df['month'] = pd.to_datetime(df['date_sold']).dt.month
print(df.groupby('month')['Event'].value_counts())

import matplotlib.pyplot as plt
import plotly.express as px

# Total sales value per month, across all products.
fig = px.bar(df.groupby('month')['Sales value'].sum().sort_values(ascending=False),
             y='Sales value', title='Sales value each month')
fig.show()
# Above we can observe the sales value for each month, irrespective of product.
import matplotlib.pyplot as plt
import plotly.express as px

# Compare the warehouses on total sales value and total units moved.
for metric, title in (
    ('Sales value', 'Sales value over the year from different warehouse'),
    ('units_sold', 'units_sold over the year from different warehouse'),
):
    fig = px.bar(df.groupby('warehouse')[metric].sum().sort_values(ascending=False),
                 y=metric, title=title)
    fig.show()

# Product mix within each warehouse (computed once, plotted and printed).
warehouse_product_counts = df.groupby('warehouse')['product_id'].value_counts()
warehouse_product_counts.plot(kind="bar", figsize=(15, 5), color='purple')
print(warehouse_product_counts)
# FB Prophet is used because it works well for data with trend and seasonality and for forecasting. An ARIMA model was also tried, but it did not perform as well as Prophet.
from fbprophet import Prophet
from sklearn import preprocessing

# Use one encoder per column: reusing a single LabelEncoder overwrites its
# fitted classes on the second fit_transform, making inverse_transform
# impossible for the first column.
warehouse_encoder = preprocessing.LabelEncoder()
df['warehouse'] = warehouse_encoder.fit_transform(df['warehouse'])
event_encoder = preprocessing.LabelEncoder()
df['Event'] = event_encoder.fit_transform(df['Event'])

# Prophet expects the date column named 'ds' and the target named 'y'.
df_final = df[['warehouse', 'product_id', 'date_sold', 'orders_count', 'buyers_count',
               'units_sold', 'Event', 'Price_at_sale', 'Promotions',
               'Sales value']].rename({'date_sold': 'ds', 'Sales value': 'y'}, axis='columns')

plt.figure(figsize=(10, 4))
# numeric_only=True: 'product_id' (object) and 'ds' (datetime) cannot take part
# in Pearson correlation; modern pandas raises without this flag (older pandas
# silently dropped them).
sns.heatmap(df_final.corr(numeric_only=True), annot=True, fmt='.0%', cmap='rainbow')
plt.figure(figsize=(8, 4))
df_final.drop('y', axis=1).corrwith(df_final['y'], numeric_only=True).plot(kind='bar', grid=True)
plt.title('Correlation with Target variables')
# Above we can observe that all features are highly correlated with the target except warehouse. Products matter more than warehouses here, because products are distributed evenly across warehouses, so the warehouse column is dropped before training the model.
# Drop the weakly-correlated warehouse column before modelling (see analysis above).
df_final.drop(columns='warehouse',inplace=True)
# --- Product P1: fit a Prophet model on its daily history and forecast 31 days
# (January 2022). ---
product1 = df_final[df_final['product_id'] == 'P1']
targetp1 = pd.DataFrame()
from fbprophet.plot import add_changepoints_to_plot
# 95% uncertainty interval; explicit yearly + daily seasonality;
# changepoint_prior_scale=.1 allows a more flexible trend than the default.
m1 = Prophet(interval_width=0.95,yearly_seasonality=True,daily_seasonality=True,changepoint_prior_scale=.1)
m1.fit(product1)
future = m1.make_future_dataframe(periods=31,freq='D')
# NOTE(review): 'Event' was never registered via m1.add_regressor(), so this
# assignment appears to have no effect on the forecast — confirm whether it was
# meant to be an extra regressor. It also creates the 'Event' column on
# `future` with NaN outside the masked rows.
future.loc[(future['ds'] >= pd.to_datetime('2022-01-01')) & (future['ds'] <= pd.to_datetime('2022-01-31')), 'Event'] = 0
forecast1 = m1.predict(future)
m1.plot(forecast1)
# Rename the point-forecast column before collecting it into targetp1.
forecast1 = forecast1.rename(columns={'yhat': 'yhat_'})
# targetp1 is empty, so this outer merge on the index effectively copies the
# forecast, indexed by date ('ds').
targetp1 = pd.merge(targetp1, forecast1.set_index('ds'), how='outer', left_index=True, right_index=True)
targetp1.columns
# Rolling-origin cross-validation of the P1 model: train on the first 340 days,
# then score 20-day horizons, advancing the cutoff every 30 days.
import warnings

from fbprophet.diagnostics import cross_validation, performance_metrics
from fbprophet.plot import plot_cross_validation_metric

warnings.filterwarnings("ignore")

df_cv1 = cross_validation(m1, initial='340 days', period='30 days', horizon='20 days')
df_p1 = performance_metrics(df_cv1)
fig = plot_cross_validation_metric(df_cv1, metric='mape')
plt.show()
# Report the mean MAPE across horizons as a percentage.
print('mape:', '{:.2f}%'.format(100 * df_p1['mape'].mean()))
# We got an acceptable MAPE error, so the model is working well.
# --- Product P2: same Prophet setup as P1, plus a what-if scenario for January
# (+20% promotions, +22% buyers) — see NOTE below on its actual effect. ---
product2 = df_final[df_final['product_id'] == 'P2']
targetp2 = pd.DataFrame()
from fbprophet.plot import add_changepoints_to_plot
m2 = Prophet(interval_width=0.95,yearly_seasonality=True,daily_seasonality=True,changepoint_prior_scale=.1)
m2.fit(product2)
future = m2.make_future_dataframe(periods=31,freq='D')
future.loc[(future['ds'] >= pd.to_datetime('2022-01-01')) & (future['ds'] <= pd.to_datetime('2022-01-31')), 'Event'] = 0
# NOTE(review): none of 'Event'/'Promotions'/'buyers_count' were registered via
# m2.add_regressor(), so these assignments appear to have no effect on the
# forecast — confirm intent. Also, the right-hand side is a Series indexed by
# product2's original row labels; .loc assignment aligns on index labels, not
# position, so the masked future rows likely receive NaN.
future.loc[(future['ds'] >= pd.to_datetime('2022-01-01')) & (future['ds'] <= pd.to_datetime('2022-01-31')), 'Promotions'] = product2['Promotions']*0.2+product2['Promotions']
future.loc[(future['ds'] >= pd.to_datetime('2022-01-01')) & (future['ds'] <= pd.to_datetime('2022-01-31')), 'buyers_count'] = product2['buyers_count']*0.22+product2['buyers_count']
forecast2 = m2.predict(future)
m2.plot(forecast2)
# Rename the point-forecast column and collect it, indexed by date, into targetp2.
forecast2 = forecast2.rename(columns={'yhat': 'yhat_'})
targetp2 = pd.merge(targetp2, forecast2.set_index('ds'), how='outer', left_index=True, right_index=True)
# Cross-validate the P2 model with the same rolling-origin scheme as P1.
df_cv2 = cross_validation(m2, initial='340 days', period='30 days', horizon = '20 days')
df_p2 = performance_metrics(df_cv2)
fig = plot_cross_validation_metric(df_cv2, metric='mape')
plt.show()
print('mape:', '{:.2f}%'.format(100 * df_p2['mape'].mean()))
# --- Product P3: same Prophet setup, with a +15% price scenario for January. ---
product3 = df_final[df_final['product_id'] == 'P3']
targetp3 = pd.DataFrame()
from fbprophet.plot import add_changepoints_to_plot
m3 = Prophet(interval_width=0.95,yearly_seasonality=True,daily_seasonality=True,changepoint_prior_scale=.1)
m3.fit(product3)
future = m3.make_future_dataframe(periods=31,freq='D')
# NOTE(review): 'Event' and 'Price_at_sale' were never registered via
# m3.add_regressor(), so these assignments appear to have no effect on the
# forecast — confirm intent. The Series assignment also aligns by index label,
# not position (see the P2 note).
future.loc[(future['ds'] >= pd.to_datetime('2022-01-01')) & (future['ds'] <= pd.to_datetime('2022-01-31')), 'Event'] = 0
future.loc[(future['ds'] >= pd.to_datetime('2022-01-01')) & (future['ds'] <= pd.to_datetime('2022-01-31')), 'Price_at_sale'] = product3['Price_at_sale']*0.15+product3['Price_at_sale']
forecast3 = m3.predict(future)
m3.plot(forecast3)
# Rename the point-forecast column and collect it, indexed by date, into targetp3.
forecast3 = forecast3.rename(columns={'yhat': 'yhat_'})
targetp3 = pd.merge(targetp3, forecast3.set_index('ds'), how='outer', left_index=True, right_index=True)
# --- Product P4: same Prophet setup, with a January scenario of -5% price and
# -14% buyers. ---
product4 = df_final[df_final['product_id'] == 'P4']
targetp4 = pd.DataFrame()
from fbprophet.plot import add_changepoints_to_plot
m4 = Prophet(interval_width=0.95,yearly_seasonality=True,daily_seasonality=True,changepoint_prior_scale=.1)
m4.fit(product4)
future = m4.make_future_dataframe(periods=31,freq='D')
# NOTE(review): these columns were never registered via m4.add_regressor(), so
# the assignments appear to have no effect on the forecast — confirm intent.
# The Series assignments also align by index label, not position (see P2 note).
future.loc[(future['ds'] >= pd.to_datetime('2022-01-01')) & (future['ds'] <= pd.to_datetime('2022-01-31')), 'Event'] = 0
future.loc[(future['ds'] >= pd.to_datetime('2022-01-01')) & (future['ds'] <= pd.to_datetime('2022-01-31')), 'Price_at_sale'] =product4['Price_at_sale'] - product4['Price_at_sale']*0.05
future.loc[(future['ds'] >= pd.to_datetime('2022-01-01')) & (future['ds'] <= pd.to_datetime('2022-01-31')), 'buyers_count'] =product4['buyers_count'] - product4['buyers_count']*0.14
forecast4 = m4.predict(future)
m4.plot(forecast4)
# Rename the point-forecast column and collect it, indexed by date, into targetp4.
forecast4 = forecast4.rename(columns={'yhat': 'yhat_'})
targetp4 = pd.merge(targetp4, forecast4.set_index('ds'), how='outer', left_index=True, right_index=True)
# --- Product P5: same Prophet setup and the same -5% price / -14% buyers
# January scenario as P4. ---
product5 = df_final[df_final['product_id'] == 'P5']
targetp5 = pd.DataFrame()
from fbprophet.plot import add_changepoints_to_plot
m5 = Prophet(interval_width=0.95,yearly_seasonality=True,daily_seasonality=True,changepoint_prior_scale=.1)
m5.fit(product5)
future = m5.make_future_dataframe(periods=31,freq='D')
# NOTE(review): these columns were never registered via m5.add_regressor(), so
# the assignments appear to have no effect on the forecast — confirm intent.
# The Series assignments also align by index label, not position (see P2 note).
future.loc[(future['ds'] >= pd.to_datetime('2022-01-01')) & (future['ds'] <= pd.to_datetime('2022-01-31')), 'Event'] = 0
future.loc[(future['ds'] >= pd.to_datetime('2022-01-01')) & (future['ds'] <= pd.to_datetime('2022-01-31')), 'Price_at_sale'] =product5['Price_at_sale'] - product5['Price_at_sale']*0.05
future.loc[(future['ds'] >= pd.to_datetime('2022-01-01')) & (future['ds'] <= pd.to_datetime('2022-01-31')), 'buyers_count'] =product5['buyers_count'] - product5['buyers_count']*0.14
forecast5 = m5.predict(future)
m5.plot(forecast5)
# Rename the point-forecast column and collect it, indexed by date, into targetp5.
forecast5 = forecast5.rename(columns={'yhat': 'yhat_'})
targetp5 = pd.merge(targetp5, forecast5.set_index('ds'), how='outer', left_index=True, right_index=True)
# Assemble the per-product point forecasts (indexed by forecast date) into one
# table, keep only the forecast month (January 2022), and chart each product.
pwise_sale = pd.DataFrame()
pwise_sale['Predicted_p1'] = targetp1['yhat_']
pwise_sale['Predicted_p2'] = targetp2['yhat_']
pwise_sale['Predicted_p3'] = targetp3['yhat_']
pwise_sale['Predicted_p4'] = targetp4['yhat_']
pwise_sale['Predicted_p5'] = targetp5['yhat_']
pwise_sale.reset_index(inplace=True)  # the date index becomes the 'ds' column
pwise_sale = pwise_sale[(pwise_sale['ds'] > '2021-12-31') & (pwise_sale['ds'] <= '2022-01-31')]
# Monthly total predicted sales per product (printed so it shows outside a notebook).
print(pwise_sale.sum())

# One daily-prediction bar chart per product; a loop replaces five copies of
# the same px.bar boilerplate.
for i in range(1, 6):
    col = f'Predicted_p{i}'
    fig = px.bar(pwise_sale.groupby('ds')[col].sum().sort_values(ascending=False),
                 y=col, title=f'predicted Sales value for product {i}')
    fig.show()
# Above we can see all the predicted data for the month of January.
# Export the January predictions; index=False keeps the leftover RangeIndex
# (meaningless after the date filter above) out of the CSV.
pwise_sale.to_csv("sales_predicted.csv", index=False)